In [2]:
#Approach 1: Using Word Cloud
!pip install wordcloud
Requirement already satisfied: wordcloud in c:\users\yvkch\anaconda4\lib\site-packages (1.7.0)
Requirement already satisfied: matplotlib in c:\users\yvkch\anaconda4\lib\site-packages (from wordcloud) (3.1.3)
Requirement already satisfied: pillow in c:\users\yvkch\anaconda4\lib\site-packages (from wordcloud) (7.2.0)
Requirement already satisfied: numpy>=1.6.1 in c:\users\yvkch\anaconda4\lib\site-packages (from wordcloud) (1.18.1)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\users\yvkch\anaconda4\lib\site-packages (from matplotlib->wordcloud) (2.4.6)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\yvkch\anaconda4\lib\site-packages (from matplotlib->wordcloud) (1.1.0)
Requirement already satisfied: cycler>=0.10 in c:\users\yvkch\anaconda4\lib\site-packages (from matplotlib->wordcloud) (0.10.0)
Requirement already satisfied: python-dateutil>=2.1 in c:\users\yvkch\anaconda4\lib\site-packages (from matplotlib->wordcloud) (2.8.1)
Requirement already satisfied: setuptools in c:\users\yvkch\anaconda4\lib\site-packages (from kiwisolver>=1.0.1->matplotlib->wordcloud) (45.2.0.post20200210)
Requirement already satisfied: six in c:\users\yvkch\anaconda4\lib\site-packages (from cycler>=0.10->matplotlib->wordcloud) (1.14.0)
WARNING: You are using pip version 20.1.1; however, version 20.2 is available.
You should consider upgrading via the 'c:\users\yvkch\anaconda4\python.exe -m pip install --upgrade pip' command.
In [1]:
# Importing the necessery modules 
from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt 
import csv 
  
# file object is created 
file_ob = open(r"C:\\Users\YVKCH\Desktop\Python Codes\\amazon_reviews.csv") 

stopwords = set(STOPWORDS) 

stopwords1=list(stopwords)
print(stopwords1)

print(type(stopwords1))

stopwords2=["for", "account", "not","on","FOR", "ACCOUNT","NOT","ON","is","This","For","The","the","hi","It","it","and"
           , "And","this","to","It is"]

print(type(stopwords2))

stopwords_final=stopwords1+stopwords2

print(stopwords_final)
 
# reader object is created 
reader_ob = csv.reader(file_ob) 
  
# contents of reader object is stored . 
# data is stored in list of list format. 
reader_contents = list(reader_ob) 
  
# empty string is declare 
text = "" 
  
# iterating through list of rows 
for row in reader_contents : 
      
    # iterating through words in the row 
    for word in row : 
  
        # concatenate the words 
        text = text + " " + word 
        
wordcloud = WordCloud(width=1000, height=1000, 
            stopwords=stopwords_final, background_color='white').generate(text) #stopwords_final is enetered into wordcloud
  

# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show()
['few', 'an', "aren't", "we're", 'ever', 'and', 'about', 'however', 'during', 'here', 'yourselves', "isn't", 'such', "we'll", "you've", 'because', 'theirs', 'themselves', 'his', 'is', 'over', 'our', "won't", 'if', "she's", "they're", 'also', 'would', "hadn't", 'him', "it's", 'k', 'on', "shan't", 'with', 'those', 'they', 'what', "they'll", 'of', "who's", "haven't", 'same', "he'd", 'are', 'having', 'but', "weren't", 'from', "he's", 'or', 'only', 'more', "doesn't", 'am', "we'd", "here's", 'himself', 'hence', 'between', "wouldn't", 'other', 'else', 'into', 'therefore', 'does', "he'll", 'could', 'have', 'before', "don't", 'in', 'doing', 'to', "couldn't", "hasn't", 'ought', 'had', "let's", 'for', 'its', 'above', "we've", 'ourselves', 'you', 'when', 'them', 'we', "mustn't", 'me', 'these', 'against', 'being', 'ours', 'very', 'there', "what's", "you're", 'should', 'can', "she'll", 'be', 'i', 'were', 'it', "you'd", 'than', 'that', 'off', 'some', 'any', "that's", 'yourself', 'r', 'com', 'shall', 'too', 'a', 'once', "where's", 'not', 'further', 'yours', "i'd", 'until', 'her', 'www', 'otherwise', 'below', "i'm", "why's", "how's", 'then', "can't", "shouldn't", "she'd", "didn't", 'http', 'my', 'she', 'has', 'he', 'just', 'out', 'while', 'as', 'nor', 'since', 'by', 'herself', 'like', "wasn't", 'the', 'where', 'all', "you'll", 'myself', 'which', 'cannot', 'both', "when's", 'been', "they've", 'through', 'so', 'no', 'down', "i'll", 'was', "there's", 'again', 'after', 'their', 'itself', 'do', "they'd", 'each', 'own', 'whom', 'why', 'who', 'get', 'your', "i've", 'up', 'under', 'most', 'how', 'hers', 'this', 'did', 'at']
<class 'list'>
<class 'list'>
['few', 'an', "aren't", "we're", 'ever', 'and', 'about', 'however', 'during', 'here', 'yourselves', "isn't", 'such', "we'll", "you've", 'because', 'theirs', 'themselves', 'his', 'is', 'over', 'our', "won't", 'if', "she's", "they're", 'also', 'would', "hadn't", 'him', "it's", 'k', 'on', "shan't", 'with', 'those', 'they', 'what', "they'll", 'of', "who's", "haven't", 'same', "he'd", 'are', 'having', 'but', "weren't", 'from', "he's", 'or', 'only', 'more', "doesn't", 'am', "we'd", "here's", 'himself', 'hence', 'between', "wouldn't", 'other', 'else', 'into', 'therefore', 'does', "he'll", 'could', 'have', 'before', "don't", 'in', 'doing', 'to', "couldn't", "hasn't", 'ought', 'had', "let's", 'for', 'its', 'above', "we've", 'ourselves', 'you', 'when', 'them', 'we', "mustn't", 'me', 'these', 'against', 'being', 'ours', 'very', 'there', "what's", "you're", 'should', 'can', "she'll", 'be', 'i', 'were', 'it', "you'd", 'than', 'that', 'off', 'some', 'any', "that's", 'yourself', 'r', 'com', 'shall', 'too', 'a', 'once', "where's", 'not', 'further', 'yours', "i'd", 'until', 'her', 'www', 'otherwise', 'below', "i'm", "why's", "how's", 'then', "can't", "shouldn't", "she'd", "didn't", 'http', 'my', 'she', 'has', 'he', 'just', 'out', 'while', 'as', 'nor', 'since', 'by', 'herself', 'like', "wasn't", 'the', 'where', 'all', "you'll", 'myself', 'which', 'cannot', 'both', "when's", 'been', "they've", 'through', 'so', 'no', 'down', "i'll", 'was', "there's", 'again', 'after', 'their', 'itself', 'do', "they'd", 'each', 'own', 'whom', 'why', 'who', 'get', 'your', "i've", 'up', 'under', 'most', 'how', 'hers', 'this', 'did', 'at', 'for', 'account', 'not', 'on', 'FOR', 'ACCOUNT', 'NOT', 'ON', 'is', 'This', 'For', 'The', 'the', 'hi', 'It', 'it', 'and', 'And', 'this', 'to', 'It is']
In [ ]:
#Insights from the above WordCloud
#1)Most customers had positive reviews about the products (identified via words- Great, Loves, easy to use)
#2)It looked like customers presented Electronics as gifts to son/daughter (many positive reviews expressed by customers are tied to son/daughter)
In [3]:
#Approach 2: TextHero
!pip install texthero
Collecting texthero
  Downloading texthero-1.0.9-py3-none-any.whl (25 kB)
Requirement already satisfied: wordcloud>=1.5.0 in c:\users\yvkch\anaconda4\lib\site-packages (from texthero) (1.7.0)
Collecting unidecode>=1.1.1
  Downloading Unidecode-1.1.1-py2.py3-none-any.whl (238 kB)
Requirement already satisfied: plotly>=4.2.0 in c:\users\yvkch\anaconda4\lib\site-packages (from texthero) (4.4.1)
Requirement already satisfied: gensim>=3.6.0 in c:\users\yvkch\anaconda4\lib\site-packages (from texthero) (3.8.3)
Requirement already satisfied: tqdm>=4.3 in c:\users\yvkch\anaconda4\lib\site-packages (from texthero) (4.48.0)
Requirement already satisfied: matplotlib>=3.1.0 in c:\users\yvkch\anaconda4\lib\site-packages (from texthero) (3.1.3)
Requirement already satisfied: spacy>=2.2.2 in c:\users\yvkch\anaconda4\lib\site-packages (from texthero) (2.2.4)
Requirement already satisfied: pandas>=1.0.2 in c:\users\yvkch\anaconda4\lib\site-packages (from texthero) (1.0.5)
Requirement already satisfied: numpy>=1.17 in c:\users\yvkch\anaconda4\lib\site-packages (from texthero) (1.18.1)
Requirement already satisfied: nltk>=3.3 in c:\users\yvkch\anaconda4\lib\site-packages (from texthero) (3.5)
Requirement already satisfied: scikit-learn>=0.22 in c:\users\yvkch\anaconda4\lib\site-packages (from texthero) (0.22)
Requirement already satisfied: pillow in c:\users\yvkch\anaconda4\lib\site-packages (from wordcloud>=1.5.0->texthero) (7.2.0)
Requirement already satisfied: six in c:\users\yvkch\anaconda4\lib\site-packages (from plotly>=4.2.0->texthero) (1.14.0)
Requirement already satisfied: retrying>=1.3.3 in c:\users\yvkch\anaconda4\lib\site-packages (from plotly>=4.2.0->texthero) (1.3.3)
Requirement already satisfied: smart-open>=1.8.1 in c:\users\yvkch\anaconda4\lib\site-packages (from gensim>=3.6.0->texthero) (2.0.0)
Requirement already satisfied: Cython==0.29.14 in c:\users\yvkch\anaconda4\lib\site-packages (from gensim>=3.6.0->texthero) (0.29.14)
Requirement already satisfied: scipy>=0.18.1 in c:\users\yvkch\anaconda4\lib\site-packages (from gensim>=3.6.0->texthero) (1.4.1)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in c:\users\yvkch\anaconda4\lib\site-packages (from matplotlib>=3.1.0->texthero) (2.4.6)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\yvkch\anaconda4\lib\site-packages (from matplotlib>=3.1.0->texthero) (1.1.0)
Requirement already satisfied: cycler>=0.10 in c:\users\yvkch\anaconda4\lib\site-packages (from matplotlib>=3.1.0->texthero) (0.10.0)
Requirement already satisfied: python-dateutil>=2.1 in c:\users\yvkch\anaconda4\lib\site-packages (from matplotlib>=3.1.0->texthero) (2.8.1)
Requirement already satisfied: plac<1.2.0,>=0.9.6 in c:\users\yvkch\anaconda4\lib\site-packages (from spacy>=2.2.2->texthero) (1.1.3)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in c:\users\yvkch\anaconda4\lib\site-packages (from spacy>=2.2.2->texthero) (1.0.2)
Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in c:\users\yvkch\anaconda4\lib\site-packages (from spacy>=2.2.2->texthero) (1.0.0)
Requirement already satisfied: setuptools in c:\users\yvkch\anaconda4\lib\site-packages (from spacy>=2.2.2->texthero) (45.2.0.post20200210)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\users\yvkch\anaconda4\lib\site-packages (from spacy>=2.2.2->texthero) (3.0.2)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\users\yvkch\anaconda4\lib\site-packages (from spacy>=2.2.2->texthero) (2.0.3)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\users\yvkch\anaconda4\lib\site-packages (from spacy>=2.2.2->texthero) (2.23.0)
Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in c:\users\yvkch\anaconda4\lib\site-packages (from spacy>=2.2.2->texthero) (0.6.0)
Requirement already satisfied: blis<0.5.0,>=0.4.0 in c:\users\yvkch\anaconda4\lib\site-packages (from spacy>=2.2.2->texthero) (0.4.1)
Requirement already satisfied: srsly<1.1.0,>=1.0.2 in c:\users\yvkch\anaconda4\lib\site-packages (from spacy>=2.2.2->texthero) (1.0.2)
Requirement already satisfied: thinc==7.4.0 in c:\users\yvkch\anaconda4\lib\site-packages (from spacy>=2.2.2->texthero) (7.4.0)
Requirement already satisfied: pytz>=2017.2 in c:\users\yvkch\anaconda4\lib\site-packages (from pandas>=1.0.2->texthero) (2019.3)
Requirement already satisfied: click in c:\users\yvkch\anaconda4\lib\site-packages (from nltk>=3.3->texthero) (7.0)
Requirement already satisfied: joblib in c:\users\yvkch\anaconda4\lib\site-packages (from nltk>=3.3->texthero) (0.14.1)
Requirement already satisfied: regex in c:\users\yvkch\anaconda4\lib\site-packages (from nltk>=3.3->texthero) (2020.5.14)
Requirement already satisfied: boto3 in c:\users\yvkch\anaconda4\lib\site-packages (from smart-open>=1.8.1->gensim>=3.6.0->texthero) (1.13.16)
Requirement already satisfied: boto in c:\users\yvkch\anaconda4\lib\site-packages (from smart-open>=1.8.1->gensim>=3.6.0->texthero) (2.49.0)
Requirement already satisfied: importlib-metadata>=0.20; python_version < "3.8" in c:\users\yvkch\anaconda4\lib\site-packages (from catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->texthero) (1.5.0)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in c:\users\yvkch\anaconda4\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->texthero) (1.25.8)
Requirement already satisfied: idna<3,>=2.5 in c:\users\yvkch\anaconda4\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->texthero) (2.8)
Requirement already satisfied: chardet<4,>=3.0.2 in c:\users\yvkch\anaconda4\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->texthero) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\yvkch\anaconda4\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy>=2.2.2->texthero) (2019.11.28)
Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in c:\users\yvkch\anaconda4\lib\site-packages (from boto3->smart-open>=1.8.1->gensim>=3.6.0->texthero) (0.10.0)
Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in c:\users\yvkch\anaconda4\lib\site-packages (from boto3->smart-open>=1.8.1->gensim>=3.6.0->texthero) (0.3.3)
Requirement already satisfied: botocore<1.17.0,>=1.16.16 in c:\users\yvkch\anaconda4\lib\site-packages (from boto3->smart-open>=1.8.1->gensim>=3.6.0->texthero) (1.16.16)
Requirement already satisfied: zipp>=0.5 in c:\users\yvkch\anaconda4\lib\site-packages (from importlib-metadata>=0.20; python_version < "3.8"->catalogue<1.1.0,>=0.0.7->spacy>=2.2.2->texthero) (2.2.0)
Requirement already satisfied: docutils<0.16,>=0.10 in c:\users\yvkch\anaconda4\lib\site-packages (from botocore<1.17.0,>=1.16.16->boto3->smart-open>=1.8.1->gensim>=3.6.0->texthero) (0.15.2)
Installing collected packages: unidecode, texthero
Successfully installed texthero-1.0.9 unidecode-1.1.1
WARNING: You are using pip version 20.1.1; however, version 20.2 is available.
You should consider upgrading via the 'c:\users\yvkch\anaconda4\python.exe -m pip install --upgrade pip' command.
In [4]:
import texthero as hero
import pandas as pd
import csv
import matplotlib as plt
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
In [6]:
#Importing only the Product Name and the textual reviews of the customer into the Dataframe for Text Mining
#I have encountered utf-8 encoding error when I read the file. Using encoding='cp1252' resolved the error
df = pd.read_csv('Amazon_Textual_Reviews_Electronics.csv',encoding='cp1252')
In [7]:
#Sample 5 records from the file
df.head(5)
Out[7]:
Product Review
0 Amazon Kindle E-Reader I thought it would be as big as small paper bu...
1 Amazon Kindle E-Reader This kindle is light and easy to use especiall...
2 Amazon Kindle E-Reader Didnt know how much i'd use a kindle so went f...
3 Amazon Kindle E-Reader I am 100 happy with my purchase. I caught it o...
4 Amazon Kindle E-Reader Solid entry level Kindle. Great for kids. Gift...
In [8]:
#Step 1: Cleaning the textual reviews with hero.clean
df['clean_text'] = hero.clean(df['Review'])
In [9]:
#Step 2: This step performs the same cleaning as Step 1. However, it takes into account only certain pre-processing cleaning steps as indicated "custom"
from texthero import preprocessing

custom       = [preprocessing.fillna,
                   preprocessing.lowercase,
                   preprocessing.remove_whitespace,
                   preprocessing.remove_digits,
                   preprocessing.remove_punctuation,
                   preprocessing.remove_diacritics,
                   preprocessing.remove_stopwords,
                   preprocessing.stem]
df['clean_text'] = hero.clean(df['Review'])
In [10]:
#Step 3: Adding Term Frequency vector and PCA values into the data
df['tfidf_clean_text'] = hero.tfidf(df['clean_text'])
df['pca_tfidf_clean_text'] = hero.pca(df['tfidf_clean_text'])
In [11]:
#Step 4: This step peforms the same function as Step1 and Step 3 combined together with the help of pipe function in pandas
df['pca'] = (
    
            df['clean_text']
            .pipe(hero.clean)
            .pipe(hero.tfidf)
            .pipe(hero.pca)
   )
In [12]:
df.head()
Out[12]:
Product Review clean_text tfidf_clean_text pca_tfidf_clean_text pca
0 Amazon Kindle E-Reader I thought it would be as big as small paper bu... thought would big small paper turn like palm t... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [-0.05985380639526171, -0.04738335280815571] [-0.06024606227635409, -0.04599464367479512]
1 Amazon Kindle E-Reader This kindle is light and easy to use especiall... kindle light easy use especially beach [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.03469439601093114, 0.12466726411412628] [0.03588899679974371, 0.12458660635788087]
2 Amazon Kindle E-Reader Didnt know how much i'd use a kindle so went f... didnt know much use kindle went lower end im h... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [-0.01581452304997732, -0.033277039593171534] [-0.015504915197290388, -0.03296878618081514]
3 Amazon Kindle E-Reader I am 100 happy with my purchase. I caught it o... happy purchase caught sale really good price n... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.05941467116532581, -0.10781235560884742] [0.05880920095144095, -0.10801794400435843]
4 Amazon Kindle E-Reader Solid entry level Kindle. Great for kids. Gift... solid entry level kindle great kids gifted kid... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [-0.026962398301289853, 0.061587518368248585] [-0.025818024075362156, 0.06234777834962251]
In [14]:
#Plot 1
#Scatter plot of PCA vector values
hero.scatterplot(df, col='pca', color='Product', title="PCA Amazon Product Reviews")
In [15]:
#Pulling the top words by product
NUM_TOP_WORDS = 10
df.groupby('Product')['clean_text'].apply(lambda x: hero.top_words(x)[:NUM_TOP_WORDS])
#You can see that the top 10 words are pulled for each different product

#If you take the first product, All-New Fire HD 8 Tablet- you can see that customers feel great about the product and its easy to use
Out[15]:
Product                        
 Amazon Kindle Fire     tablet     212
                        great      150
                        bought      99
                        one         95
                        use         95
                                  ... 
Kindle Oasis E-reader   would       30
                        charge      27
                        light       26
                        reading     26
                        read        25
Name: clean_text, Length: 110, dtype: int64
In [16]:
#Plot 2: Plotting K-means
#pipe is the Pandas function used when chaining together functions

df['tfidf2'] = (
    df['Review']
    .pipe(hero.clean)
    .pipe(hero.tfidf)
)
df['kmeans_labels'] = (
    df['tfidf2']
    .pipe(hero.kmeans, n_clusters=5) #Defining 5 clusters (Forcing the algorithm to use only 5 clusters)
    .astype(str)
)
df['pca_k'] = df['tfidf2'].pipe(hero.pca)
hero.scatterplot(df, 'pca_k', color='kmeans_labels', title="K-means Plot for Amazon Customer Reviews")
In [ ]: